// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 * Unicode utilities.
 */
#pragma once

#include <sys/types.h>

// NOTE(review): presumably a feature macro that lets code including this
// header detect that Fast_UnicodeUtil::IsWordChar is backed by the word
// character property table. It is not referenced anywhere in this header;
// confirm against the files that test for it.
#define FASTLIB_UNICODEUTIL_USES_WORDCHARPROP


/** Byte type used for the individual components of a multi-byte UTF-8 character. */
typedef unsigned char utf8_t;
/** Type used to hold one 4-byte UCS4 character. */
typedef unsigned int ucs4_t;

/**
 * Collection of static helpers for unicode text.
 * Offers property tests on single UCS4 characters (word character,
 * terminal punctuation, lowercasing) and conversion routines between
 * UTF-8 and UCS4. Everything except the destructor is static.
 */
class Fast_UnicodeUtil {
private:
    /**
     * Lookup table giving the UTF-8 character length in bytes.
     * It is only declared here; none of the inline functions in this
     * class read it.
     */
    static unsigned char _utf8header[256];

    /**
     * Lowercase mapping, organized in two levels: 256 pages of
     * 256 elements each. An element of 0 means "no mapping"
     * (see ToLower). The table is defined in unicode-lowercase.cpp,
     * which is autogenerated by the extcase application.
     */
    static unsigned short *_compLowerCase[256];

    /**
     * Character property bits, organized in two levels: 256 pages of
     * 256 elements each. Each element is a bitwise OR of the property
     * bits declared below. The table is defined in
     * unicode-charprops.cpp, which is autogenerated by the extprop
     * application.
     */
    static unsigned char *_compCharProps[256];


    /**
     * Bit masks for the elements of _compCharProps. The inline functions
     * of this class only test _wordcharProp and
     * _terminalPunctuationCharProp.
     */
    enum {
        _spaceProp = 1,
        _wordcharProp = 2,
        _ideographicProp = 4,
        _decimalDigitCharProp = 8,
        _ignorableControlCharProp = 16,
        _terminalPunctuationCharProp = 32
    };

public:
    virtual ~Fast_UnicodeUtil() { }

    /** Initialize the ISO 8859-1 static tables. */
    static void InitTables();

    /** Value returned by the decoders for an invalid UTF-8 character sequence. */
    enum { _BadUTF8Char = 0xfffffffeu };

    /**
     * Test whether a character is a word character.
     * The answer is read from the word character bit of the generated
     * property table. That bit is documented as covering characters
     * with one or more of the properties alphabetic, ideographic,
     * combining char, decimal digit char, private use, extender.
     * @param testchar the UCS4 character to test.
     * @return true if testchar is below 65536 and has the word character
     *         property; false otherwise (characters from 65536 and up
     *         are never reported as word characters).
     */
    static bool IsWordChar(ucs4_t testchar) {
        if (testchar >= 65536)
            return false;
        const unsigned char props = _compCharProps[testchar >> 8][testchar & 255];
        return (props & _wordcharProp) != 0;
    }

    /**
     * Decode the next UCS4 character from an UTF-8 string buffer.
     * The src pointer is modified so that the call can be repeated
     * to walk through the string.
     * @param src Reference to a pointer to the current position
     *            in the UTF-8 string.
     * @return The next UCS4 character, or _BadUTF8Char if the
     *         next character is invalid.
     */
    static ucs4_t GetUTF8Char(const unsigned char *& src);

    /** Same as above, for buffers addressed through plain char pointers. */
    static ucs4_t GetUTF8Char(const char *& src) {
        const unsigned char *cursor = reinterpret_cast<const unsigned char *>(src);
        const ucs4_t decoded = GetUTF8Char(cursor);
        // Hand the advanced position back to the caller.
        src = reinterpret_cast<const char *>(cursor);
        return decoded;
    }

    /**
     * Write the UTF-8 representation of an UCS4 character into a buffer.
     * Between 1 and 6 bytes are written (see utf8clen). No bounds check
     * is made and no terminating 0 byte is appended, so the caller must
     * make sure that dst has room for the encoded character.
     * Values from 0x80000000 and up get a lead byte of 0xfe or 0xff.
     * @param dst The destination buffer.
     * @param i The UCS4 character.
     * @return Pointer to the position in dst just after the written byte(s).
     */
    static char *utf8cput(char *dst, ucs4_t i) {
        // Each branch writes one lead byte holding the top bits and a
        // length marker, followed by continuation bytes of the form
        // 10xxxxxx that hold six bits each.
        if (i < 128) {
            *dst++ = static_cast<char>(i);
            return dst;
        }
        if (i < 0x800) {
            *dst++ = static_cast<char>(0xc0 | (i >> 6));
            *dst++ = static_cast<char>(0x80 | (i & 63));
            return dst;
        }
        if (i < 0x10000) {
            *dst++ = static_cast<char>(0xe0 | (i >> 12));
            *dst++ = static_cast<char>(0x80 | ((i >> 6) & 63));
            *dst++ = static_cast<char>(0x80 | (i & 63));
            return dst;
        }
        if (i < 0x200000) {
            *dst++ = static_cast<char>(0xf0 | (i >> 18));
            *dst++ = static_cast<char>(0x80 | ((i >> 12) & 63));
            *dst++ = static_cast<char>(0x80 | ((i >> 6) & 63));
            *dst++ = static_cast<char>(0x80 | (i & 63));
            return dst;
        }
        if (i < 0x4000000) {
            *dst++ = static_cast<char>(0xf8 | (i >> 24));
            *dst++ = static_cast<char>(0x80 | ((i >> 18) & 63));
            *dst++ = static_cast<char>(0x80 | ((i >> 12) & 63));
            *dst++ = static_cast<char>(0x80 | ((i >> 6) & 63));
            *dst++ = static_cast<char>(0x80 | (i & 63));
            return dst;
        }
        *dst++ = static_cast<char>(0xfc | (i >> 30));
        *dst++ = static_cast<char>(0x80 | ((i >> 24) & 63));
        *dst++ = static_cast<char>(0x80 | ((i >> 18) & 63));
        *dst++ = static_cast<char>(0x80 | ((i >> 12) & 63));
        *dst++ = static_cast<char>(0x80 | ((i >> 6) & 63));
        *dst++ = static_cast<char>(0x80 | (i & 63));
        return dst;
    }

    /**
     * Convert an UTF-8 string into an UCS4 string.
     * @param dst The UCS4 destination buffer.
     * @param src The UTF-8 source buffer.
     * @return A pointer to the destination string.
     */
    static ucs4_t *ucs4copy(ucs4_t *dst, const char *src);

    /**
     * Compute how many bytes utf8cput writes for an UCS4 character.
     * @param i The UCS4 character.
     * @return The number of bytes (1 to 6) required for the UTF-8
     *         representation.
     */
    static size_t utf8clen(ucs4_t i) {
        // Checked from the widest encoding down to the narrowest.
        if (i >= 0x4000000)
            return 6;
        if (i >= 0x200000)
            return 5;
        if (i >= 0x10000)
            return 4;
        if (i >= 0x800)
            return 3;
        if (i >= 128)
            return 2;
        return 1;
    }

    /**
     * Map an UCS4 character to lowercase.
     * @param testchar The character to lowercase.
     * @return The lowercase of the input if the lowercase table holds a
     *         mapping for it. Otherwise, and for all characters from
     *         65536 and up, the input character itself.
     */
    static ucs4_t ToLower(ucs4_t testchar)
    {
        if (testchar >= 65536)
            return testchar;
        const ucs4_t mapped = _compLowerCase[testchar >> 8][testchar & 255];
        // A table entry of 0 means that no mapping is defined.
        return (mapped != 0) ? mapped : testchar;
    }

    /**
     * Step forwards or backwards a number of characters within an
     * UTF8 buffer. pos is modified to yield the new position if possible.
     * @param start A pointer to the start of the UTF8 buffer
     * @param length The length of the UTF8 buffer
     * @param pos A pointer to the current position within the UTF8 buffer,
     *            updated to reflect the new position upon return
     * @param offset An offset (+/-) in number of UTF8 characters.
     *        Offset 0 means move to the start of the current character.
     * @return Number of bytes moved, or -1 if out of range
     */
    static int UTF8move(const unsigned char *start, size_t length,
                        const unsigned char *&pos, off_t offset);

    /**
     * Count the characters in an UCS4 string.
     * @param str The UCS4 string.
     * @return The number of characters.
     */
    static size_t ucs4strlen(const ucs4_t *str);

    /**
     * Convert an UCS4 string to UTF-8, bounded by max lengths.
     * @param dst The destination buffer for the UTF-8 string.
     * @param src The source UCS4 string.
     * @param maxdst The maximum number of bytes to put into dst.
     * @param maxsrc The maximum number of characters to convert from src.
     * @return A pointer to the destination.
     */
    static char *utf8ncopy(char *dst, const ucs4_t *src, int maxdst, int maxsrc);


    /**
     * Compare an UTF-8 string with an UCS4 string, analogous to strcmp(3).
     * @param s1 The UTF-8 string.
     * @param s2 The UCS4 string.
     * @return An integer less than, equal to, or greater than zero,
     *        if s1 is, respectively, less than, matching, or greater than s2.
     * NB Only used in local test
     */
    static int utf8cmp(const char *s1, const ucs4_t *s2);

    /**
     * Convert an ISO-8859-1 string into a newly allocated UTF-8 string.
     * @param src The source ISO-8859-1 string.
     * @return Pointer to a newly allocated buffer with the UTF-8 result.
     * NOTE(review): how the buffer is to be released is not visible in
     * this header; check the implementation before freeing it.
     * NB Only use in local test
     */
    static char *strdupLAT1(const char *src);

    /**
     * Test whether a character is terminal punctuation.
     * @param testchar the UCS4 character to test.
     * @return true if testchar is below 65536 and has the terminal
     *    punctuation char property; false otherwise.
     */
    static bool IsTerminalPunctuationChar(ucs4_t testchar) {
        if (testchar >= 65536)
            return false;
        const unsigned char props = _compCharProps[testchar >> 8][testchar & 255];
        return (props & _terminalPunctuationCharProp) != 0;
    }

    /**
     * Decode the next UCS4 character from an UTF-8 string buffer, where
     * the first byte of the UTF-8 string is assumed to be >= 0x80
     * (non-ascii). The src pointer is modified so that the call can be
     * repeated.
     * @param src Reference to a pointer to the current position
     *            in the UTF-8 string.
     * @return The next UCS4 character, or _BadUTF8Char if the
     *         next character is invalid.
     */
    static ucs4_t GetUTF8CharNonAscii(const unsigned char *&src);

    /** Same as above, for buffers addressed through plain char pointers. */
    static ucs4_t GetUTF8CharNonAscii(const char *&src) {
        const unsigned char *cursor = reinterpret_cast<const unsigned char *>(src);
        const ucs4_t decoded = GetUTF8CharNonAscii(cursor);
        // Hand the advanced position back to the caller.
        src = reinterpret_cast<const char *>(cursor);
        return decoded;
    }
};
